Imports¶

In [127]:
import spacy
import pandas as pd
from tqdm.auto import tqdm 
import swifter
import plotly.express as px
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import textacy
from collections import Counter
import random
import os
import pickle
from matplotlib.pyplot import figure



from pathlib import Path
import ast

pd.options.plotting.backend = "plotly"
random.seed(123)
In [16]:
def cloud_from_lemmas(word_counts):
    wc = WordCloud(width=800, height=400)
    wc.generate_from_frequencies(frequencies=word_counts)
    plt.figure(figsize=(10,8))
    plt.imshow(wc)

def plot_counts(counts):
    fig = px.bar(counts,orientation='h', y='word', x='count')

    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(bargap=0.30, font={'size':10})
    return fig
In [2]:
en = spacy.load("en_core_web_sm")
In [40]:
mar02 = pd.read_csv("./first_week/02_mar.csv")
mar01 = pd.read_csv("./first_week/01_mar.csv")
feb28 = pd.read_csv("./first_week/28_feb.csv")
feb27 = pd.read_csv("./first_week/27_feb.csv")
feb26 = pd.read_csv("./first_week/26_feb.csv")
feb25 = pd.read_csv("./first_week/25_feb.csv")
feb24 = pd.read_csv("./first_week/24_feb.csv")

print(len(feb24))
feb24 = feb24.iloc[:48001]

l = [mar02, mar01, feb28, feb27, feb26, feb25, feb24]
1468659
In [4]:
for x in l:
    print(len(x))
48001
48001
48001
48001
48001
48001
48001
In [5]:
feb28
Out[5]:
Unnamed: 0 Datetime Tweet Id Text Username Replies Count Retweets Count Likes Count Quotes Count Language Retweeted Tweet Quoted Tweet Mentioned Users
0 0 2022-02-28 23:59:59+00:00 1498447925306744832 Inside Volodymyr Velenskyy's rise to President... ONEAMERICANMAD1 0 0 0 0 en NaN NaN NaN
1 1 2022-02-28 23:59:59+00:00 1498447925080309762 It's a summer like day, 70F. I'm sitting outsi... TPBookSeries 1 0 1 0 en NaN NaN NaN
2 2 2022-02-28 23:59:59+00:00 1498447923973079044 @wallaceme @GBNEWS It’s more deliberately pern... Ed_Owen 0 0 4 0 en NaN NaN ['wallaceme', 'GBNEWS']
3 3 2022-02-28 23:59:59+00:00 1498447923784343563 Although the Australian Government has asked c... CommsroomC 0 0 1 0 en NaN NaN NaN
4 4 2022-02-28 23:59:59+00:00 1498447923776045060 Although @ausgov has asked citizens not to tra... PublicSpectrum 0 1 2 0 en NaN NaN ['ausgov']
... ... ... ... ... ... ... ... ... ... ... ... ... ...
47996 47996 2022-02-28 22:26:02+00:00 1498424278756200448 @barnes_law The Russians estimated military bu... Bstokesss 1 0 11 0 en NaN NaN ['barnes_law']
47997 47997 2022-02-28 22:26:02+00:00 1498424278739345409 Final Rule Adds Sweeping Restrictions on Expor... LawAnalysis 0 0 0 0 en NaN NaN ['mstockbridgelaw']
47998 47998 2022-02-28 22:26:01+00:00 1498424277451911168 I’d have my animals with me too. #Ukraine http... katya_kowal 0 0 1 0 en NaN https://twitter.com/LorenzoTheCat/status/14983... NaN
47999 47999 2022-02-28 22:26:01+00:00 1498424277393149958 Gadget Game News : Elon Musk’s promised Starli... kalpak_savaliya 0 0 1 0 en NaN NaN NaN
48000 48000 2022-02-28 22:26:01+00:00 1498424277367881733 @Albert_Nobbs ARE WE HAPPY NOW, ALBERTA? CAN O... BanFoolish 1 0 4 0 en NaN NaN ['Albert_Nobbs']

48001 rows × 13 columns

In [6]:
len(feb24)
Out[6]:
48001
In [7]:
df = pd.concat(l)
In [8]:
df
Out[8]:
Unnamed: 0 Datetime Tweet Id Text Username Replies Count Retweets Count Likes Count Quotes Count Language Retweeted Tweet Quoted Tweet Mentioned Users
0 0 2022-03-02 23:59:59+00:00 1499172701289828352 In solidarity with Ukraine it might be time to... adinplore 0 0 0 0 en NaN NaN NaN
1 1 2022-03-02 23:59:59+00:00 1499172700924911621 First Ukraine City Falls as Russia Strikes Mor... JanatakhabarP 0 0 0 0 en NaN NaN NaN
2 2 2022-03-02 23:59:59+00:00 1499172700157263872 There is a good @RealLifeLore22 video on this.... Liam_Holman99 0 2 2 0 en NaN https://twitter.com/dontbrexitfixit/status/149... ['RealLifeLore22']
3 3 2022-03-02 23:59:59+00:00 1499172699003838468 Invading Ukraine was a choice. TELEXDesignCo 0 0 0 0 en NaN NaN NaN
4 4 2022-03-02 23:59:59+00:00 1499172698978762755 @GGraczka @Ukraine That's not what they're say... SiJeDisTakbir 1 0 0 0 en NaN NaN ['GGraczka', 'Ukraine']
... ... ... ... ... ... ... ... ... ... ... ... ... ...
47996 47996 2022-02-24 23:00:43+00:00 1496983457381105671 @niezbyt_babo @Formaela19 @lovingthemwomen @Pe... aartemidee 0 0 1 0 en NaN NaN ['niezbyt_babo', 'Formaela19', 'lovingthemwome...
47997 47997 2022-02-24 23:00:43+00:00 1496983457377001484 @katramdeen Holy shit Katherine, i totally for... RonnyS0L 1 0 0 0 en NaN NaN ['katramdeen']
47998 47998 2022-02-24 23:00:43+00:00 1496983456877797380 @BigBluexlt Coups a comin. Certain G8s want Zs... USAMRIID 1 3 9 1 en NaN NaN ['BigBluexlt']
47999 47999 2022-02-24 23:00:43+00:00 1496983456697491465 @heartshapedwomb right lmao i don't think the ... na74362408 2 0 2 0 en NaN NaN ['heartshapedwomb']
48000 48000 2022-02-24 23:00:43+00:00 1496983456479334400 @hutchinson People are so fucking goddamn stup... Banshee__Baby 0 1 0 0 en NaN NaN ['hutchinson']

336007 rows × 13 columns

mar02¶

In [41]:
mar02 = mar02.iloc[:20000]
In [42]:
df = mar02
In [43]:
df = df.loc[df['Language'] == 'en']
In [44]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [45]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19981 [00:00<?, ?it/s]
In [46]:
mar02 = df
In [47]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
mar02['lemmas'] = mar02.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [48]:
mar02_word_counts = Counter(mar02.lemmas.sum())
In [49]:
mar02_counts = pd.DataFrame(Counter({k: v for k, v in mar02_word_counts.items()}).most_common(60), columns=['word', 'count'])
In [50]:
cloud_from_lemmas(mar02_word_counts)
In [51]:
plot_counts(mar02_counts)
In [52]:
mar02_word_counts
Out[52]:
Counter({'solidarity': 99,
         'Ukraine': 20145,
         'time': 669,
         'remove': 92,
         'color': 34,
         'white': 127,
         'blue': 65,
         'red': 43,
         'thing': 455,
         'represent': 31,
         'russian': 2603,
         'flag': 138,
         'City': 45,
         'Falls': 20,
         'Russia': 5432,
         'strike': 95,
         'civilian': 595,
         '\xa0': 568,
         'target': 194,
         'https://t.co/It0d0ZMv9E': 1,
         'good': 602,
         '@RealLifeLore22': 1,
         'video': 225,
         'Putin': 3246,
         'want': 1124,
         'act': 160,
         'russophilic': 1,
         'buffer': 13,
         'zone': 130,
         'oil': 239,
         'field': 29,
         'strategic': 48,
         'advantage': 12,
         'https://t.co/jppja9ogmG': 1,
         'Invading': 15,
         'choice': 77,
         '@GGraczka': 1,
         '@Ukraine': 477,
         'say': 1197,
         'share': 153,
         'testimony': 5,
         'one': 79,
         'commit': 182,
         'crime': 488,
         'punish': 44,
         'death': 341,
         'penalty': 10,
         'type': 39,
         '@zoltarssg': 1,
         '@orwellandrade': 1,
         '@PredictIt': 1,
         'announce': 80,
         '@SSGamblers': 1,
         'discord': 5,
         'dump': 8,
         'trade': 43,
         '30': 61,
         'probably': 157,
         'bad': 392,
         'bet': 41,
         'place': 215,
         'mean': 343,
         'obvious': 35,
         'recommend': 19,
         'pod': 1,
         'invasion': 1399,
         'shelling': 46,
         'news': 395,
         'report': 291,
         '@POTUS': 257,
         'rewrite': 4,
         'speech': 64,
         'Israel': 149,
         'quiet': 18,
         'think': 985,
         'going': 91,
         'to': 113,
         'appear': 63,
         'move': 84,
         'tactical': 14,
         'battlefield': 11,
         'nuke': 153,
         'close': 224,
         'bode': 1,
         'desperate': 27,
         'people': 2271,
         'action': 219,
         'leave': 403,
         'cornered': 3,
         'RAT': 2,
         'space': 56,
         'run': 157,
         'prepared': 16,
         'kill': 585,
         'https://t.co/gcB6vsj4ek': 1,
         'Victoria': 22,
         'Secret': 17,
         'VSCO': 8,
         'Q4': 8,
         '2021': 32,
         'earning': 12,
         'issue': 181,
         'weak': 69,
         'outlook': 13,
         'quote': 36,
         'https://t.co/rXHO3HWBkP': 1,
         '@usaldak47': 1,
         '@Israel': 20,
         '90': 16,
         'year': 469,
         'travel': 55,
         'past': 106,
         '😥': 18,
         '@blunomatterhoo': 1,
         'find': 241,
         'try': 512,
         'assimilation': 1,
         'idea': 122,
         "didn't": 5,
         'will': 260,
         'man': 329,
         'Toronto': 7,
         'ground': 128,
         'Poland': 280,
         'help': 1225,
         'refugee': 298,
         'direct': 56,
         'Winnipeg': 2,
         'group': 139,
         'collect': 45,
         '1000': 19,
         'kg': 1,
         'humanitarian': 273,
         'aid': 262,
         'radio': 23,
         'diaper': 7,
         'Canada': 154,
         '@nuttallreport': 1,
         '@OmarMosleh': 1,
         'https://t.co/MiIbGFp6QR': 1,
         'Lady': 7,
         'take': 546,
         'stand': 638,
         'amid': 117,
         'Invasion': 138,
         'Panic': 2,
         'Tears': 1,
         'People': 147,
         'https://t.co/pjxrl4lq8l': 1,
         '@da_cheese': 1,
         '@redfishstream': 1,
         'journalist': 46,
         'invade': 869,
         'ukraine': 974,
         '😱': 15,
         'St.': 22,
         'Javelin': 9,
         'vs': 60,
         'Improvised': 2,
         'Armor': 1,
         'Tanks': 6,
         '|': 360,
         'sofrep': 1,
         'https://t.co/x5agxv1zvq': 1,
         'fuck': 147,
         'prayer': 120,
         'UkraineRussianWar': 60,
         'RussiaUkraine': 96,
         'banrussiafromswift': 3,
         'CancelRussia': 4,
         'DefeatPutin': 7,
         'DefendUkraine': 13,
         'FckPutin': 9,
         'FreeUkraine': 14,
         'https://t.co/vfhnqq0jxr': 1,
         '@joncoopertweet': 27,
         '@gop': 12,
         'lead': 196,
         'Trump': 425,
         'blackmail': 27,
         'vote': 505,
         'withhold': 39,
         'support': 1405,
         'make': 285,
         '@DicksonReps': 1,
         '@murpharoo': 1,
         '@joshbutler': 1,
         '@msmarto': 1,
         'wow': 47,
         'west': 190,
         'atrocity': 57,
         'ignore': 72,
         'attack': 578,
         'obviously': 57,
         'lot': 229,
         'war': 3024,
         'definitely': 62,
         'particular': 22,
         'get': 463,
         '@ksidiii': 1,
         'standard': 29,
         'deviation': 1,
         'price': 156,
         'combine': 10,
         'straight': 40,
         'day': 521,
         'trading': 6,
         '10': 89,
         'range': 17,
         'month': 114,
         'like': 1173,
         'penny': 10,
         'stock': 67,
         'embargo': 12,
         'wipe': 35,
         'entire': 82,
         'week': 288,
         'gain': 46,
         'resolution': 237,
         '\u2066@nytimes\u2069': 3,
         '\u2066@doctecazoid\u2069': 1,
         'right': 771,
         'wait': 145,
         'Olympics': 151,
         'https://t.co/pjnszazykh': 1,
         'conflict': 387,
         'put': 87,
         'cyber': 13,
         'warfare': 25,
         'center': 30,
         '-Host': 1,
         '@tteminWFED': 1,
         '@markcmontgomery': 1,
         '@FDD_CCTI': 1,
         ' \n\n': 54,
         'https://t.co/dugszt1h5d': 1,
         '@FederalNewsNet': 1,
         'customer': 9,
         'ask': 419,
         'ukrainian': 669,
         'decal': 1,
         '2': 325,
         'Tryzub': 2,
         '1': 290,
         'great': 229,
         'grandfather': 8,
         'terrible': 40,
         'happen': 562,
         'standwithukraine': 32,
         'https://t.co/t1ZW6pBPbn': 1,
         '@seanhannity': 7,
         'go': 920,
         'sad': 115,
         'leader': 222,
         'assassinate': 17,
         'world': 1031,
         'cry': 55,
         'sick': 30,
         'feeling': 23,
         'feel': 296,
         'fall': 229,
         'save': 172,
         '😢': 33,
         'ukrainerussiawar': 267,
         'dead': 139,
         'soldier': 416,
         'near': 105,
         'Kherson': 161,
         'warning': 22,
         'GRAPHIC': 2,
         'UkraineInvasion': 39,
         'SlavaUkraini': 37,
         'russianukrainianwar': 161,
         'https://t.co/yBXJCcZBAd': 1,
         '@McFaul': 59,
         'mad': 37,
         'simply': 55,
         'bear': 77,
         'wind': 9,
         'hold': 249,
         'card': 19,
         'i.e.': 7,
         'know': 1032,
         'дурак': 1,
         'fool': 29,
         'stupid': 80,
         'etc': 134,
         '\n ': 192,
         'cf': 1,
         'eat': 30,
         'Queen': 3,
         'Spades': 2,
         'fail': 100,
         'bid': 21,
         'shoot': 81,
         'moon': 1,
         'https://t.co/gnwr1eoizz': 1,
         '@zebedy1997': 2,
         '@ffschristie': 1,
         '@seanzjay': 4,
         '@yimbychris': 6,
         '@jeremycorbyn': 27,
         '@stwuk': 24,
         'forget': 133,
         'state': 326,
         'talk': 408,
         'future': 114,
         'fear': 112,
         'direction': 23,
         'course': 84,
         'Europe': 404,
         'look': 427,
         'hate': 114,
         'Libya': 42,
         'Afghanistan': 170,
         'tragedy': 30,
         'far': 221,
         'sadly': 32,
         '@sharonlwa': 1,
         '44': 11,
         'm': 122,
         '7b': 1,
         'Earth': 14,
         'european': 136,
         'NATO': 1282,
         'nuclear': 333,
         '6,956,000,000': 1,
         'horrible': 43,
         'situation': 281,
         'heart': 175,
         'break': 166,
         '@music__bee': 1,
         'pshower': 1,
         'expect': 103,
         'God': 182,
         'Bless': 13,
         'unholy': 1,
         'sean': 1,
         'mug': 3,
         'design': 18,
         'https://t.co/xwhusp2lb2': 1,
         'https://t.co/km7o2xlz3c': 1,
         '@geofflath': 2,
         '@bjoneslaw1972': 4,
         '@Osinttechnical': 5,
         'Russians': 571,
         'Ukrainians': 374,
         'advice': 15,
         'intent': 12,
         'push': 139,
         'propaganda': 221,
         'social': 93,
         'medium': 315,
         'insurgency': 12,
         'care': 295,
         'interested': 21,
         'handle': 25,
         'ally': 130,
         'responsible': 49,
         'crisis': 262,
         'Dr.': 13,
         'John': 41,
         'J.': 2,
         'Mearsheimer': 25,
         'important': 127,
         'understand': 271,
         'West': 243,
         '2008': 23,
         'turn': 191,
         'western': 256,
         'bulwark': 2,
         'border': 331,
         'https://t.co/lrsqefjte7': 1,
         '@3lidw': 2,
         '@mod_russia': 12,
         'believe': 318,
         'win': 246,
         'come': 546,
         'strong': 161,
         'Union': 143,
         'new': 222,
         'friendly': 23,
         'member': 243,
         'tell': 484,
         'school': 67,
         'ScoonTv': 4,
         'https://t.co/7ojjcny5fr': 1,
         'citizen': 254,
         'absolutely': 95,
         'eviscerate': 2,
         'MSM': 29,
         'libtard': 1,
         'virtue': 25,
         'signal': 30,
         'idiotic': 5,
         'family': 313,
         'friend': 206,
         'https://t.co/mUcZm7RRKA': 1,
         'add': 133,
         'narrative': 46,
         'Biden': 600,
         'Divert': 1,
         'Agents': 1,
         'U.S.': 247,
         'Southern': 14,
         'Border': 10,
         'send': 659,
         'Assist': 1,
         'Conflict': 22,
         'https://t.co/t3p8LXPERg': 1,
         '@pamelageller': 1,
         '@cesc_james': 1,
         '@GalekNaughty': 4,
         '@spectatorindex': 46,
         'army': 209,
         'armoured': 5,
         'vehicle': 61,
         'Iraq': 164,
         '2003': 13,
         'Google': 28,
         'participate': 16,
         'link': 56,
         'answer': 79,
         'https://t.co/GdIKv67g78': 1,
         '@DreamLeaf5': 4,
         '@RaccoonLeandro': 3,
         'Right': 13,
         'misremember': 1,
         'specific': 20,
         'sure': 224,
         'agree': 194,
         'appeasement': 12,
         'short': 58,
         'troop': 319,
         'immediately': 79,
         'benefit': 74,
         'tribe': 2,
         'settle': 24,
         'Dnieper': 5,
         'river': 9,
         'Belarus': 159,
         'North': 48,
         'spread': 71,
         'northward': 1,
         'northern': 7,
         'Volga': 1,
         'valley': 1,
         'east': 53,
         'modern': 25,
         'Moscow': 131,
         'basin': 1,
         'Dniester': 1,
         'Buh': 1,
         'present': 41,
         'southern': 34,
         'head': 149,
         'Golda': 1,
         'Meir': 1,
         'watch': 354,
         'Munich': 9,
         'https://t.co/arkswwBtNw': 1,
         'nerve': 10,
         'pro': 119,
         'call': 311,
         'fascist': 60,
         'remember': 163,
         'flight': 51,
         'MH17': 6,
         'separatist': 27,
         'hide': 61,
         'fact': 195,
         'slaughter': 54,
         'nearly': 61,
         '300': 13,
         'international': 143,
         'Jesus': 18,
         'Украина': 5,
         'РоссияСмотри': 1,
         'https://t.co/lq27ie1khm': 1,
         'guy': 225,
         'fight': 788,
         'Moldova': 44,
         'https://t.co/mussysigqc': 1,
         'Iryna': 2,
         'Red': 29,
         'Cross': 18,
         'Fund': 12,
         'https://t.co/ds6bbiwd1e': 1,
         '@crist_aras': 2,
         'u': 144,
         'well': 217,
         'reason': 224,
         'terrorist': 61,
         '@dbrand': 1,
         '@Mrwhosetheboss': 1,
         '@MKBHD': 2,
         's': 89,
         'beg': 26,
         'prize': 4,
         'punishable': 3,
         'worst': 4,
         'frontline': 7,
         '🤣': 100,
         '😂': 176,
         'Street': 18,
         'combat': 41,
         'minute': 62,
         'territorial': 27,
         'defense': 84,
         'liberate': 19,
         'city': 370,
         'russiaukrainewar': 18,
         'explain': 111,
         'https://t.co/jjymqnb7hz': 1,
         '@youtube': 195,
         '@nikkihaley': 3,
         '@repspartz': 4,
         'hell': 95,
         '@europeanpan': 1,
         'self': 68,
         'hater': 1,
         'love': 268,
         'influence': 42,
         'NYT': 19,
         'China': 473,
         'delay': 73,
         'winter': 24,
         'crazy': 62,
         'squeeze': 9,
         '$': 343,
         'Korea': 57,
         '@newsmax': 11,
         'need': 1054,
         '@drew95ca': 1,
         '@jemoole': 1,
         '@DavidAn53897256': 5,
         '@ZelenskyyUa': 184,
         'India': 207,
         'show': 185,
         'decency': 4,
         'refuse': 111,
         'Collabration': 1,
         'way': 559,
         'street': 51,
         '@ECR_CoR': 1,
         'grateful': 16,
         '@eu_cor': 1,
         'dedicated': 4,
         'EU': 395,
         'region': 96,
         'urgent': 38,
         '@EU_CoR': 1,
         'voice': 48,
         'grant': 21,
         'candidate': 11,
         'status': 20,
         'sound.#stoprussianaggression': 1,
         'russiainvadedukraine': 8,
         '🔴': 21,
         'breaking': 48,
         '     \n\n': 1,
         'air': 155,
         'alert': 23,
         'kyiv': 121,
         'mobilize': 7,
         'civil': 48,
         'team': 99,
         'devilputin': 1,
         'fuckputin': 6,
         '  ': 140,
         'https://t.co/K1XQiiTHEr': 1,
         'findyourthe': 1,
         'redbubble': 1,
         '@dagenmcdowell': 2,
         'retirement': 4,
         'America': 220,
         'give': 382,
         'big': 243,
         'HELPFUL': 2,
         'foreign': 112,
         'PUTIN': 65,
         'buy': 179,
         '111': 4,
         'barrel': 23,
         'UKRAINE': 315,
         'PEOPLE': 36,
         'failure': 45,
         'https://t.co/hzbthamn3f': 1,
         'hunter': 8,
         'deal': 115,
         'continue': 242,
         'https://t.co/yZyUNvqmsk': 1,
         'War': 419,
         'https://t.co/6iAHcwPQav': 1,
         'see': 360,
         'countless': 12,
         'feature': 15,
         'story': 108,
         'Molotov': 14,
         'cocktail': 19,
         'scene': 25,
         'classic': 2,
         'nonviolent': 1,
         'resistance': 66,
         'rarely': 7,
         'stunning': 6,
         'effective': 25,
         'tactic': 39,
         'https://t.co/bf3odvmx2a': 1,
         'sunflower': 31,
         'national': 133,
         'flower': 28,
         'let': 510,
         'start': 426,
         'post': 196,
         'ukranian': 47,
         'cover': 90,
         'Sunflowers': 4,
         ' \n': 54,
         'Facebook': 19,
         'surprise': 28,
         'Zuck': 1,
         'allow': 189,
         'https://t.co/klweym3jzs': 1,
         'worry': 72,
         'fmr': 4,
         'Zelenskyy': 75,
         'advisor': 14,
         'https://t.co/f3tqbathwv': 1,
         '@msnbc': 6,
         '@lubimayarussiya': 1,
         '@aaronjmate': 22,
         '@TrumpPres2017': 1,
         'Donbas': 51,
         'Cope': 1,
         'teach': 26,
         'history': 206,
         'actively': 28,
         'live': 384,
         'State': 89,
         'High': 11,
         'student': 232,
         'textbook': 2,
         'tv': 105,
         'screen': 14,
         'inform': 18,
         'https://t.co/ymBDkhWf7i': 1,
         '@mint_4_ukraine': 1,
         'perfect': 21,
         'example': 72,
         'powerful': 45,
         'NFT': 43,
         'work': 326,
         'impact': 79,
         'donate': 336,
         'piece': 61,
         'art': 35,
         'https://t.co/8kqignoofx': 1,
         'orange': 7,
         'cone': 3,
         'stop': 1001,
         'snow': 5,
         'clearing': 2,
         'truck': 30,
         'track': 31,
         'Québec': 1,
         'thousand': 126,
         'embe': 1,
         'grenade': 13,
         'case': 107,
         'tank': 169,
         'thé': 1,
         'respect': 83,
         'deserve': 78,
         'https://t.co/PjCuEYH9Wl': 1,
         '@UptownComCapita': 1,
         '@JackPosobiec': 14,
         'peacekeeper': 9,
         '@housegop': 1,
         'FOUGHT': 1,
         'DEMOCRACY': 5,
         'HARD': 1,
         'invader': 44,
         'Jimmy': 6,
         'Max': 9,
         'yes': 266,
         'Neo': 39,
         'nazi': 227,
         'Recap': 6,
         'w/': 42,
         'Blumenthal': 7,
         'https://t.co/ek3prxnrwu': 1,
         '@chloevtweet': 1,
         'launder': 11,
         'money': 371,
         'got': 68,
         'honey': 2,
         'pot': 6,
         'flow': 13,
         'dry': 10,
         'dime': 5,
         'seize': 82,
         'reconstruction': 12,
         'cost': 76,
         'bit': 76,
         '🌻': 165,
         '🙏': 322,
         'https://t.co/5bwbzep5gm': 1,
         '@burneroftaxe': 2,
         '@NSStr0ng': 1,
         '@GenuineNat': 2,
         'major': 101,
         'cite': 24,
         '@MartinHeinrich': 10,
         'weapon': 582,
         'assistance': 199,
         'defend': 385,
         'innocent': 337,
         '@potu': 238,
         'provide': 306,
         'safeairliftukraine': 138,
         'StopPutin': 184,
         'sir': 38,
         'martin': 5,
         'indifferent': 2,
         '@Uncle_Joe_x': 1,
         '@saracha45158427': 3,
         '@nytimes': 62,
         'Donbass': 39,
         'Crimea': 117,
         'anti': 215,
         'instal': 31,
         'puppet': 76,
         'goverment': 6,
         'https://t.co/T42aBbZ6tH': 1,
         '@CivMilAir': 3,
         'eye': 79,
         'belarus': 2,
         'supply': 218,
         'shipment': 15,
         'ready': 54,
         'warn': 109,
         'security': 117,
         '@hopie93632267': 1,
         '@lancesterling12': 1,
         '12': 19,
         'Palestinians': 30,
         'learn': 103,
         'human': 152,
         'violent': 26,
         'hatred': 18,
         'Tampa': 1,
         'newlywed': 1,
         'urge': 64,
         'speak': 184,
         'https://t.co/wzlmmnc8og': 1,
         'Trey': 4,
         'Yingst': 1,
         '@FoxNews': 44,
         'explosion': 46,
         'Kiyv': 5,
         'Capitol': 15,
         '@niii65919770': 1,
         '@kotnikjanez': 1,
         '@Caucasuswar': 14,
         'lmao': 10,
         'angle': 10,
         'shit': 117,
         'nazis': 26,
         'pick': 49,
         'looter': 8,
         'subject': 23,
         'stockade': 1,
         'cling': 3,
         'wrap': 14,
         'utility': 2,
         'pole': 8,
         'Reddit': 3,
         'https://t.co/Jyytei9MYh': 1,
         '@TimInHonolulu': 9,
         '@whnsc': 5,
         '@DefenseIntel': 10,
         '@DI_Ukraine': 10,
         'use': 281,
         'keep': 109,
         'option': 39,
         '@gopleader': 21,
         'liberal': 32,
         'open': 146,
         'border.we': 1,
         'sotuincrisi': 1,
         'wonder': 115,
         'Iranians': 12,
         'bout': 8,
         'kind': 83,
         'pay': 202,
         'tax': 22,
         'dollar': 44,
         'Crimes': 15,
         'https://t.co/4ypjvajnzn': 1,
         'sympathetic': 10,
         'movement': 30,
         'term': 84,
         'origin': 7,
         'root': 34,
         'fault': 59,
         'differently': 10,
         '5': 136,
         'effort': 98,
         'play': 166,
         'kingmaker': 1,
         'euromaiden': 1,
         'unmitigated': 1,
         'disaster': 35,
         'confirm': 75,
         'Airdrop': 20,
         'receive': 76,
         'Crypto': 64,
         'Donations': 32,
         'https://t.co/bDGssQQCqA': 1,
         'floodgate': 1,
         'racism': 102,
         'weird': 29,
         'drunk': 5,
         'wedding': 3,
         'dgaf': 1,
         'https://t.co/hbsolbzrw7': 1,
         '@ameyaw112': 1,
         'Wob3k': 1,
         'anaa': 1,
         'roman': 27,
         'Abramovich': 91,
         'billionaire': 37,
         'Chelsea': 114,
         'owner': 49,
         'sell': 219,
         'club': 69,
         'https://t.co/ywwznjeotd': 1,
         'House': 125,
         'Resolution': 31,
         'Montana': 17,
         'Rep.': 30,
         'Matt': 32,
         'Rosendale': 46,
         'Kentucky': 19,
         'Thomas': 49,
         'Massie': 65,
         'Arizona': 26,
         'Paul': 60,
         'Gosar': 61,
         'reply': 22,
         'bank': 54,
         'account': 105,
         'suffer': 121,
         'dysfunctional': 1,
         'turmoil': 3,
         'hardship': 8,
         'killing': 23,
         'nonsense': 31,
         '🤷': 31,
         '🏽\u200d': 6,
         '♀': 26,
         '️': 394,
         'https://t.co/CUwfCXkMO0': 1,
         '@MikhailFridman': 1,
         'hear': 203,
         'denounce': 38,
         'power': 235,
         'convince': 32,
         'circle': 12,
         'end': 394,
         'totally': 49,
         'unjustified': 6,
         'pariah': 11,
         '👉': 16,
         'Fresh': 1,
         'channel': 36,
         'https://t.co/hwVsGexSMM': 1,
         'News': 249,
         'Dogecoin': 44,
         'community': 103,
         '53': 21,
         'k': 54,
         'country': 1533,
         'hint': 21,
         'upcoming': 18,
         'airdrop': 45,
         '\n\n ': 64,
         'GEMs': 1,
         'https://t.co/znaxtdiyaj': 1,
         'onlygem': 1,
         'btc': 9,
         'eth': 10,
         'bnb': 3,
         'feg': 1,
         'nft': 20,
         '@BeattieDoug': 1,
         'typical': 12,
         'repulsive': 3,
         'hypocrisy': 29,
         'familiar': 10,
         'moral': 52,
         'equivalence': 4,
         'campaign': 56,
         'blood': 64,
         'child': 318,
         'hand': 168,
         'atone': 1,
         'official': 229,
         'Kyiv': 243,
         'loud': 26,
         'beginning': 18,
         'UkraineRussiaWar': 38,
         'UkraineUnderAttack': 50,
         'https://t.co/LfzipjKrhW': 1,
         'syrian': 7,
         'foul': 1,
         'racist': 100,
         'coverage': 67,
         'CBC': 10,
         'https://t.co/lnp9ibw3eu': 1,
         'catch': 44,
         'athlete': 20,
         'sport': 31,
         'organization': 54,
         'respond': 48,
         'violence': 55,
         'https://t.co/dhDXYnXvCN': 1,
         '@Benjami54553803': 3,
         '@jamwood20': 2,
         '@covid19_murder': 6,
         '@unionjock1': 2,
         '@Sniper_Wolf5': 7,
         'americans': 6,
         'fighting': 72,
         '@jaycybersecuri1': 1,
         '@auraalborn': 2,
         '@astrosoul9': 3,
         '@ADanielHill': 4,
         'scam': 20,
         'raise': 117,
         '56,675': 1,
         'asset': 92,
         'donation': 166,
         'enter': 68,
         'giveaway': 1,
         'certify': 3,
         'government': 390,
         'site': 42,
         'hope': 348,
         'cause': 193,
         '❤': 181,
         'instruction': 7,
         'spirit': 28,
         'https://t.co/hHp4KNSOj3': 1,
         '@IAPonomarenko': 86,
         'hero': 81,
         'limb': 2,
         'compliment': 2,
         '@SenToddYoung': 2,
         'commentary': 10,
         'precise': 4,
         'measure': 39,
         'senator': 11,
         'partisan': 11,
         'dig': 16,
         'bait': 11,
         'criticism': 17,
         'fare': 1,
         'beneath': 3,
         'dignity': 6,
         'office': 52,
         'https://t.co/mBxBJVj0KX': 1,
         '@JeffSchogol': 1,
         'Shit': 2,
         '@chinahand': 1,
         'decade': 80,
         'heartless': 6,
         'https://t.co/u5o90ieqsk': 1,
         '@cranky_yankee': 1,
         'Zelensky': 210,
         'iconic': 4,
         'movie': 25,
         'life': 332,
         'evacuate': 57,
         'meme': 17,
         'standing': 8,
         'ovation': 10,
         'damn': 40,
         '@potus': 16,
         'https://t.co/IRnsT6XLDY': 1,
         '@HonNonsoNwankwo': 1,
         '@transferchecker': 1,
         'staying': 1,
         'neutral': 70,
         'favour': 21,
         'Finland': 73,
         'swiss': 4,
         'soon': 156,
         'later': 39,
         ...})
In [ ]:
 

mar01¶

In [53]:
mar01 = mar01.iloc[:20000]
In [54]:
df = mar01
In [55]:
df = df.loc[df['Language'] == 'en']
In [56]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [57]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19983 [00:00<?, ?it/s]
In [58]:
mar01 = df
In [59]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
mar01['lemmas'] = mar01.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [60]:
mar01_word_counts = Counter(mar01.lemmas.sum())
In [ ]:
 

feb28¶

In [61]:
feb28 = feb28.iloc[:20000]
In [62]:
df = feb28
In [63]:
df = df.loc[df['Language'] == 'en']
In [64]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [65]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19977 [00:00<?, ?it/s]
In [66]:
feb28 = df
In [67]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
feb28['lemmas'] = feb28.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [68]:
feb28_word_counts = Counter(feb28.lemmas.sum())
In [ ]:
 

feb27¶

In [69]:
feb27 = feb27.iloc[:20000]
In [70]:
df = feb27
In [71]:
df = df.loc[df['Language'] == 'en']
In [72]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [73]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19974 [00:00<?, ?it/s]
In [74]:
feb27 = df
In [75]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
feb27['lemmas'] = feb27.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [76]:
feb27_word_counts = Counter(feb27.lemmas.sum())
In [ ]:
 
In [ ]:
 

feb26¶

In [77]:
feb26 = feb26.iloc[:20000]
In [78]:
df = feb26
In [79]:
df = df.loc[df['Language'] == 'en']
In [80]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [81]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19979 [00:00<?, ?it/s]
In [82]:
feb26 = df
In [83]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
feb26['lemmas'] = feb26.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [84]:
feb26_word_counts = Counter(feb26.lemmas.sum())
In [ ]:
 

feb25¶

In [85]:
feb25 = feb25.iloc[:20000]
In [86]:
df = feb25
In [87]:
df = df.loc[df['Language'] == 'en']
In [88]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [89]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19974 [00:00<?, ?it/s]
In [90]:
feb25 = df
In [91]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
feb25['lemmas'] = feb25.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [92]:
feb25_word_counts = Counter(feb25.lemmas.sum())
In [ ]:
 

feb24¶

In [93]:
feb24 = feb24.iloc[:20000]
In [94]:
df = feb24
In [95]:
df = df.loc[df['Language'] == 'en']
In [96]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [97]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/19978 [00:00<?, ?it/s]
In [98]:
feb24 = df
In [99]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
feb24['lemmas'] = feb24.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [100]:
feb24_word_counts = Counter(feb24.lemmas.sum())
In [ ]:
 
In [101]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = mar02.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

mar02_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [102]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = mar01.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

mar01_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [103]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = feb28.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

feb28_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [104]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = feb27.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

feb27_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [105]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = feb26.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

feb26_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [106]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = feb25.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

feb25_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [107]:
not_interesting = {'the', '@', 'a', 'this'}

lemmas_ngrams = feb24.Text_en.apply(lambda doc: list(doc.noun_chunks))
lemmas_ngrams = lemmas_ngrams.apply(lambda x: [''.join(str(el)) for el in x if len(el) == 2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if len(x.split())==2])
lemmas_ngrams = lemmas_ngrams.apply(lambda w: [x for x in w if x.split()[0].lower() not in not_interesting if x.split()[1].lower() not in not_interesting])

feb24_word_counts_ngrams = dict(Counter(lemmas_ngrams.sum()))
In [ ]:
 
In [109]:
all_word_counts = feb24_word_counts + feb25_word_counts + feb26_word_counts + feb27_word_counts + feb28_word_counts + mar01_word_counts + mar02_word_counts 
In [120]:
all_word_counts.most_common(10)
Out[120]:
[('Ukraine', 140474),
 ('Russia', 35135),
 ('Putin', 23274),
 ('war', 16895),
 ('russian', 16705),
 ('people', 16000),
 ('NATO', 10415),
 ('country', 10136),
 ('invasion', 8858),
 ('support', 8851)]
In [121]:
x = ['feb24', 'feb25', 'feb26', 'feb27', 'feb28', 'mar01', 'mar02']
counts_list = [feb24_word_counts, feb25_word_counts, feb26_word_counts, feb27_word_counts, feb28_word_counts, mar01_word_counts, mar02_word_counts ]
In [171]:
def plot_over_time(base, counts_list, start, stop):
    figure(figsize=(16, 10))

    for el in base.most_common()[start: stop]:
        y=[]
        for k in counts_list:
            y.append(k.get(el[0]))
        plt.plot(x, y, label = el[0] + " "+ str(el[1]))


    plt.legend()    
    plt.show()
In [172]:
plot_over_time(all_word_counts, counts_list, 0, 10)
In [173]:
plot_over_time(all_word_counts, counts_list, 1, 11)
In [174]:
plot_over_time(all_word_counts, counts_list, 11, 21)
In [175]:
plot_over_time(all_word_counts, counts_list, 21, 31)
In [176]:
plot_over_time(all_word_counts, counts_list, 31, 41)
In [179]:
plot_over_time(all_word_counts, counts_list, 41, 51)
C:\Users\jakub\anaconda3\envs\WBII\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning:

Glyph 128591 (\N{PERSON WITH FOLDED HANDS}) missing from current font.

teraz osobno¶

In [181]:
plot_over_time(feb24_word_counts, counts_list, 0, 10)
In [182]:
plot_over_time(mar02_word_counts, counts_list, 0, 10)

teraz z pominiciem topki z all¶

In [203]:
def plot_without_all(base, counts_list, start, stop, all_counts = all_word_counts):
    figure(figsize=(16, 10))
    
    a = all_counts.most_common(50)
    for i,l in enumerate(a):#wybieramy tylko wyrazy
        a[i] = l[0].lower()
    
    b = base.most_common()
    nb = []

    for w in b:
        if w[0].lower() not in a:
            
            nb.append(w)
    
    
    for el in nb[start: stop]:
        y=[]
        for k in counts_list:
            y.append(k.get(el[0]))
        plt.plot(x, y, label = el[0] + " "+ str(el[1]))


    plt.legend()    
    plt.show()
In [204]:
plot_without_all(feb24_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [205]:
plot_without_all(feb24_word_counts, counts_list, 11, 20, all_counts = all_word_counts)

kolejne miesiace¶

In [207]:
plot_without_all(feb25_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [206]:
plot_without_all(feb25_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [210]:
plot_without_all(feb26_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [211]:
plot_without_all(feb26_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [ ]:
 
In [212]:
plot_without_all(feb27_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [213]:
plot_without_all(feb27_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [214]:
plot_without_all(feb28_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [215]:
plot_without_all(feb28_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [216]:
plot_without_all(mar01_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [217]:
plot_without_all(mar01_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [218]:
plot_without_all(mar02_word_counts, counts_list, 0, 11, all_counts = all_word_counts)
In [219]:
plot_without_all(mar02_word_counts, counts_list, 11, 20, all_counts = all_word_counts)
In [ ]:
 

moze cos ciekawszego wyjdzie z 2¶

In [224]:
mar02_word_counts_ngrams = Counter(mar02_word_counts_ngrams)
mar01_word_counts_ngrams = Counter(mar01_word_counts_ngrams)
feb28_word_counts_ngrams = Counter(feb28_word_counts_ngrams)
feb27_word_counts_ngrams = Counter(feb27_word_counts_ngrams)
feb26_word_counts_ngrams = Counter(feb26_word_counts_ngrams)
feb25_word_counts_ngrams = Counter(feb25_word_counts_ngrams)
feb24_word_counts_ngrams = Counter(feb24_word_counts_ngrams)
In [225]:
all_word_counts_ngrams = mar02_word_counts_ngrams + mar01_word_counts_ngrams + feb28_word_counts_ngrams + feb27_word_counts_ngrams + feb26_word_counts_ngrams + feb25_word_counts_ngrams + feb24_word_counts_ngrams
In [226]:
x = ['feb24', 'feb25', 'feb26', 'feb27', 'feb28', 'mar01', 'mar02']
ngrams_list = [mar02_word_counts_ngrams , mar01_word_counts_ngrams , feb28_word_counts_ngrams , feb27_word_counts_ngrams , feb26_word_counts_ngrams , feb25_word_counts_ngrams, feb24_word_counts_ngrams]
In [227]:
def plot_over_time(base, counts_list, start, stop):
    figure(figsize=(16, 10))

    for el in base.most_common()[start: stop]:
        y=[]
        for k in counts_list:
            y.append(k.get(el[0]))
        plt.plot(x, y, label = el[0] + " "+ str(el[1]))


    plt.legend()    
    plt.show()
In [228]:
plot_over_time(all_word_counts_ngrams, ngrams_list, 0, 10)
In [229]:
plot_over_time(all_word_counts_ngrams, ngrams_list, 10, 20)
In [230]:
plot_over_time(all_word_counts_ngrams, ngrams_list, 20, 30)
In [231]:
plot_over_time(all_word_counts_ngrams, ngrams_list, 30, 40)

27 lutego wiadomość o starlinkach dla Ukrainy

In [232]:
plot_over_time(all_word_counts_ngrams, ngrams_list, 40, 50)

teraz dniami ale juz bez tych fraz co wyzej¶

In [239]:
def plot_without_all_ngrams(base, counts_list, start, stop, all_counts = all_word_counts_ngrams):
    figure(figsize=(16, 10))
    
    a = all_counts.most_common(50)
    for i,l in enumerate(a):#wybieramy tylko wyrazy
        a[i] = l[0].lower()
    
    b = base.most_common()
    nb = []

    for w in b:
        if w[0].lower() not in a:
            
            nb.append(w)
    
    
    for el in nb[start: stop]:
        y=[]
        for k in counts_list:
            y.append(k.get(el[0]))
        plt.plot(x, y, label = el[0] + " "+ str(el[1]))


    plt.legend()    
    plt.show()
In [240]:
plot_without_all_ngrams(feb24_word_counts_ngrams, ngrams_list, 0, 10)
C:\Users\jakub\anaconda3\envs\WBII\lib\site-packages\IPython\core\pylabtools.py:151: UserWarning:

Glyph 128591 (\N{PERSON WITH FOLDED HANDS}) missing from current font.

In [241]:
plot_without_all_ngrams(feb24_word_counts_ngrams, ngrams_list, 10, 20)
In [242]:
plot_without_all_ngrams(feb24_word_counts_ngrams, ngrams_list, 10, 20)
In [243]:
plot_without_all_ngrams(feb24_word_counts_ngrams, ngrams_list, 20, 30)
In [244]:
plot_without_all_ngrams(feb25_word_counts_ngrams, ngrams_list, 0 ,10)
In [245]:
plot_without_all_ngrams(feb25_word_counts_ngrams, ngrams_list, 10, 20)
In [246]:
plot_without_all_ngrams(feb26_word_counts_ngrams, ngrams_list, 0 ,10)
In [247]:
plot_without_all_ngrams(feb26_word_counts_ngrams, ngrams_list, 10, 20)
In [250]:
plot_without_all_ngrams(feb27_word_counts_ngrams, ngrams_list, 0 ,10)
In [251]:
plot_without_all_ngrams(feb27_word_counts_ngrams, ngrams_list, 10, 20)
In [252]:
plot_without_all_ngrams(feb28_word_counts_ngrams, ngrams_list, 0 ,10)
In [253]:
plot_without_all_ngrams(feb28_word_counts_ngrams, ngrams_list, 10, 20)
In [254]:
plot_without_all_ngrams(mar01_word_counts_ngrams, ngrams_list, 0 ,10)
In [255]:
plot_without_all_ngrams(mar01_word_counts_ngrams, ngrams_list, 10, 20)
In [256]:
plot_without_all_ngrams(mar02_word_counts_ngrams, ngrams_list, 0 ,10)
In [257]:
plot_without_all_ngrams(mar02_word_counts_ngrams, ngrams_list, 10, 20)
In [ ]:
 
In [260]:
plot_without_all_ngrams(mar02_word_counts_ngrams, ngrams_list, 20, 30)
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [36]:
df = df.loc[df['Language'] == 'en']
In [37]:
48001*7-len(df)
Out[37]:
428
In [38]:
df = df.drop(columns=['Unnamed: 0', 'Language'])
df = df.reset_index()
In [39]:
df["Text_en"] = df['Text'].swifter.apply(en)
Pandas Apply:   0%|          | 0/335579 [00:00<?, ?it/s]
In [40]:
df_copy = df

EDA¶

In [ ]:
def cloud_from_lemmas(word_counts):
    wc = WordCloud(width=800, height=400)
    wc.generate_from_frequencies(frequencies=word_counts)
    plt.figure(figsize=(10,8))
    plt.imshow(wc)

def plot_counts(counts):
    fig = px.bar(counts,orientation='h', y='word', x='count')

    fig['layout']['yaxis']['autorange'] = "reversed"
    fig.update_layout(bargap=0.30, font={'size':10})
    return fig
In [41]:
not_interesting = set(["\n", "\n\n", "🇺", "🇦", " ", "", '🇷', '👇', 'amp'])
df['lemmas'] = df.Text_en.apply(lambda doc: [token.lemma_ for token in doc if not token.is_stop if not token.is_punct if not token.lemma_ in not_interesting])
In [43]:
doc_lens = df["Text_en"].str.len()

doc_lens.hist(log_y=True)
In [44]:
fig, ax = plt.subplots(figsize=(19, 13))
ax.boxplot(doc_lens)
plt.show()
In [ ]:
word_counts = Counter(df.lemmas.sum())
In [ ]:
counts = pd.DataFrame(Counter({k: v for k, v in word_counts.items()}).most_common(60), columns=['word', 'count'])
In [ ]: